1. Import Data and Library

library(Seurat)
## Attaching SeuratObject
library(SeuratData)
## Registered S3 method overwritten by 'cli':
##   method     from    
##   print.boxx spatstat
## ── Installed datasets ───────────────────────────────────── SeuratData v0.2.1 ──
## ✓ bmcite 0.3.0
## ────────────────────────────────────── Key ─────────────────────────────────────
## ✓ Dataset loaded successfully
## > Dataset built with a newer version of Seurat than installed
## ❓ Unknown version of Seurat installed
library(cowplot)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(SAVER)
# Just an example way to preprocess data, to demonstrate the PCA part
# # You could use SCTransform into RunPCA also
bm <- LoadData(ds = "bmcite")
## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used

## Warning in if (is.na(desc)) {: the condition has length > 1 and only the first
## element will be used
orig_count <- Matrix::t(bm@assays$RNA@counts)
celltype_factor <- as.numeric(as.factor(bm@meta.data$celltype.l2))

load("CD4_seurat.Rdata")
saver_mat <- Matrix::t(saver_dat$estimate)
# high_var <- FindVariableFeatures(bm,
#                                  selection.method = "vst", nfeatures = 2000)
# high_gene <- high_var@assays$RNA@var.features
# CD4_ind <- which(as.factor(bm@meta.data$celltype.l2) == "CD4 Naive")
# 
# sub_dat <- Matrix::t(orig_count[CD4_ind, high_gene])
# 
# saver_dat <- SAVER::saver(sub_dat)
# save(saver_dat, store_date, high_gene, CD4_ind, file = "CD4_seurat.Rdata")
# store_date <- date()
# cell_labels <- unique(bm@meta.data$celltype.l2)
# cell_type <- bm@meta.data$celltype.l2
# rand_ind <- c()
# 
# for (cell in cell_labels){
#   set.seed(10)
#   subcell_ind <- which(cell_type == cell)
#   subcell_len <- length(subcell_ind)
#   subcell_mat <- orig_count[subcell_ind, ]
# 
#   row_ind <- apply(orig_count, 1, function(x){length(which(x != 0))})
#   idx <- order(row_ind, decreasing = T)
#   sub_rand <- sample(length(subcell_ind),
#                      length(subcell_ind)/40)
#   rand_ind <- c(rand_ind, idx[1:(subcell_len/30)])
# }
# 
# sub_dat <- orig_count[rand_ind, ]
# 
# col_ind <- apply(sub_dat, 2, function(x){length(which(x != 0))})
# idx <- order(col_ind, decreasing = T)[1:500]
# 
# sub_dat <- sub_dat[, idx]
# 
# dat_hclust <- hclust(dist(t(as.matrix(sub_dat))))
# dat_index <- dat_hclust$order
# 
# sub_dat <- sub_dat[, dat_index]
# sub_celltype <- cell_type[rand_ind]
# sub_cluster_labels <- as.numeric(as.factor(sub_celltype))

dim(saver_mat)
## [1] 4500 2000
first_filter <- apply(saver_mat, 2, function(x) {sd(x)})
first_filter_ind <- which(first_filter != 0)
saver_mat <- saver_mat[, first_filter_ind]
dim(saver_mat)
## [1] 4500 1805

Apply SAVER

# saver_dat <- SAVER::saver(t(as.matrix(sub_dat)))
# saver_orig <- SAVER::saver(orig_count)
# save(bm, saver_dat, cell_type, sub_dat, sub_celltype, store_date, file = "SAVER_seurat.Rdata")

2-1. Dependency Measures

library(reshape2) # melt function
library(ggplot2) # ggplot function
library(pcaPP) # Fast Kendall function
library(energy) # Distance Correlation
library(Hmisc) # Hoeffding's D measure
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following object is masked from 'package:SeuratObject':
## 
##     Key
## The following object is masked from 'package:Seurat':
## 
##     Key
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(zebu) # Normalized Mutual Information
# library(minerva) # Maximum Information Coefficient
library(XICOR) # Chatterjee's Coefficient
# library(dHSIC) # Hilbert Schmidt Independence Criterion
library(VineCopula) # Blomqvist's Beta

make_cormat <- function(dat_mat){
matrix_dat <- matrix(nrow = ncol(dat_mat), ncol = ncol(dat_mat))
  cor_mat_list <- list()
  
  basic_cor <- c("pearson", "spearman")
  # find each of the correlation matrices with Pearson, Spearman, Kendall Correlation Coefficients
  for (i in 1:2){
    print(i)
    cor_mat <- stats::cor(dat_mat, method = basic_cor[i])
    cor_mat[upper.tri(cor_mat, diag = T)] <- NA
    cor_mat_list[[i]] <- cor_mat
  }
  
  # functions that take matrix or data.frame as input
  no_loop_function <- c(pcaPP::cor.fk, Hmisc::hoeffd, 
                        VineCopula::BetaMatrix)
  for (i in 3:5){
    print(i)
    fun <- no_loop_function[[i-2]]
    cor_mat <- fun(dat_mat)
    if (i == 4){ # Hoeffding's D
      cor_mat <- cor_mat$D
    }
    cor_mat[upper.tri(cor_mat, diag = T)] <- NA
    cor_mat_list[[i]] <- cor_mat
  }
  
  # functions that take two variables as input to calculate correlations.
  need_loop <- c(zebu::lassie, energy::dcor2d, XICOR::calculateXI)

  for (i in 6:8){
    print(i)
    fun <- need_loop[[i-5]]
    
    cor_mat <- matrix(nrow = ncol(dat_mat),
                      ncol = ncol(dat_mat))
    
    for (j in 2:ncol(dat_mat)){
      for (k in 1:(j-1)){
        if (i == 6){
          cor_mat[j, k] <- fun(cbind(dat_mat[, j], dat_mat[, k]), continuous=c(1,2), breaks = 6, measure = "npmi")$global

        } else {
          cor_mat[j, k] <- fun(as.numeric(dat_mat[, j]),
                               as.numeric(dat_mat[, k]))
        }
      }
    }
    
    cor_mat[upper.tri(cor_mat, diag = T)] <- NA
    cor_mat_list[[i]] <- cor_mat
  }
  return(cor_mat_list)
}

draw_heatmap <- function(cor_mat){
    len <- 5
    melted_cormat <- melt(cor_mat)
    melted_cormat <- melted_cormat[!is.na(melted_cormat$value),]
    break_vec <- round(as.numeric(quantile(melted_cormat$value,
                                           probs = seq(0, 1, length.out = len),
                                           na.rm = T)),
                       4)
    break_vec[1] <- break_vec[1]-1
    break_vec[len] <- break_vec[len]+1
    melted_cormat$value <- cut(melted_cormat$value, breaks = break_vec)
    heatmap_color <- unique(melted_cormat$value)
  
    heatmap <- ggplot(data = melted_cormat, aes(x = Var2, y = Var1, fill = value))+
      geom_tile(colour = "Black") +
      ggplot2::scale_fill_manual(breaks = sort(heatmap_color), 
                                 values = rev(scales::viridis_pal(begin = 0, end = 1)
                                              (length(heatmap_color)))) +
      theme_bw() + # make the background white
      theme(panel.border = element_blank(), panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(), axis.ticks = element_blank(),
            # erase tick marks and labels
            axis.text.x = element_blank(), axis.text.y = element_blank())
    
    return (heatmap)
}

make_cor_heatmap <- function(dat_mat, cor_mat_list){
  fun_lable <- c("Pearson's Correlation", "Spearman's Correlation", "Kendall's Correlation",
                 "Hoeffding's D", "Blomqvist's Beta", "NMI",
                  "Distance Correlation", "XI Correlation")
  
  cor_heatmap_list <- list()
  cor_abs_heatmap_list <- list()
  
  # make correlation matrices
  #cor_mat_list <- make_cormat(dat_mat)
  
  for (i in 1:8){
    print(i)
    if(i == 5) {
      cor_heatmap_list[[i]] <- NULL
      cor_abs_heatmap_list[[i]] <- NULL
      next 
    }
    cor_mat <- abs(cor_mat_list[[i]])
    
    # get heatmaps
    cor_heatmap <- draw_heatmap(cor_mat)
    
    # ggplot labels
    ggplot_labs <- labs(title = paste("Heatmap of", fun_lable[i]),
                      x = "",
                      y = "",
                      fill = "Coefficient") # change the title and legend label
      
    cor_heatmap_list[[i]] <- cor_heatmap + ggplot_labs
    
    if (i %in% c(1,2,3,4,6)){
      cor_abs_mat <- abs(cor_mat_list[[i]])
      cor_abs_heatmap <- draw_heatmap(cor_abs_mat)
      ggplot_abs_labs <- labs(title = paste("Abs Heatmap of", fun_lable[i]),
                              x = "", # change the title and legend label
                              y = "", 
                              fill = "Coefficient") 
      cor_abs_heatmap_list[[i]] <- cor_abs_heatmap + ggplot_abs_labs
    } else {
      ggplot_abs_labs <- labs(title = paste("Abs Heatmap of", fun_lable[i]),
                              subtitle = "Equivalent to Non-Abs Heatmap",
                              x = "", # change the title and legend label
                              y = "", 
                              fill = "Coefficient") 
      cor_abs_heatmap_list[[i]] <- cor_heatmap + ggplot_abs_labs
    }
  }
  
  ans <- list(cor_heatmap_list, cor_abs_heatmap_list)
  
  return (ans)
}
load("CD4_seurat_corr2.RData")
# save(cormat_list, heatmap_list, saver_mat, store_date,
#     file = "CD4_seurat_corr2.RData")

# cormat_list <- make_cormat(saver_mat)
# heatmap_list <- make_cor_heatmap(saver_mat, cormat_list)

cor_pearson_mat <- cormat_list[[1]]; cor_spearman_mat <- cormat_list[[2]];
cor_kendall_mat <- cormat_list[[3]]; cor_hoeffd_mat <- cormat_list[[4]];
cor_blomqvist_mat <- cormat_list[[5]]; cor_MI_mat <- cormat_list[[6]];
cor_dist_mat <- cormat_list[[7]]; cor_XI_mat <- cormat_list[[8]];

1. Pearson’s correlation coefficient

  • Pearson’s correlation is to measure linear dependency of data, X and Y
  • \(-1 \leq \rho_{Pearson}(X, Y) \leq 1\)
  • \(\rho_{Pearson}(X, Y) = \frac{\sum(x_i-\bar{x})(y_i -\bar{y})}{\sum(x_i-\bar{x})^2(y_i -\bar{y})^2}\)
cor_pearson_mat[1:5,1:5]
##             IGKC       HBA2         HBB       HBA1 IGHA1
## IGKC          NA         NA          NA         NA    NA
## HBA2  0.03360766         NA          NA         NA    NA
## HBB   0.02048294 0.96939713          NA         NA    NA
## HBA1  0.06589510 0.97633835 0.986972826         NA    NA
## IGHA1 0.50242775 0.01596036 0.007303465 0.03546479    NA
quantile(cor_pearson_mat, na.rm = T)
##           0%          25%          50%          75%         100% 
## -0.790171530 -0.006763938 -0.001996166  0.004827661  1.000000000
quantile(abs(cor_pearson_mat), na.rm = T)
##           0%          25%          50%          75%         100% 
## 1.386344e-18 2.406502e-03 6.309352e-03 1.379830e-02 1.000000e+00
# plot the smallest correlations
cor_pearson_vec <- sort(abs(cor_pearson_mat), decreasing = T)
plot(cor_pearson_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_pearson_mat) == cor_pearson_vec[i], arr.ind = T)
 idx1 <- idx[1]; idx2 <- idx[2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_pearson_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_pearson_mat) == rev(cor_pearson_vec)[i], arr.ind = T)
 idx1 <- idx[1]; idx2 <- idx[2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_pearson_mat[idx1, idx2], 3)))
}

Heatmap

heatmap_list[[1]][[1]]

2. Spearman’s correlation coefficient

  • It captures the monotonic relationship between data, X and Y
  • \(-1 \leq \rho_{Spearman}(X,Y) \leq 1\)
  • \(\rho_{Spearman} = 1 - \frac{6\sum{d_i^2}}{n(n^2-1)}\) where \(d_i\) is the difference between the ranks of \(x_i\) and \(y_i\)
cor_spearman_mat[1:5,1:5]
##            IGKC      HBA2       HBB      HBA1 IGHA1
## IGKC         NA        NA        NA        NA    NA
## HBA2  0.3868128        NA        NA        NA    NA
## HBB   0.4134363 0.9407810        NA        NA    NA
## HBA1  0.4033735 0.9474558 0.9286280        NA    NA
## IGHA1 0.5538631 0.2751407 0.3012491 0.2677616    NA
quantile(cor_spearman_mat, na.rm = T)
##           0%          25%          50%          75%         100% 
## -0.861926228 -0.001335261  0.020749572  0.221332870  1.000000000
quantile(abs(cor_spearman_mat), na.rm = T)
##          0%         25%         50%         75%        100% 
## 0.000000000 0.007690792 0.032266296 0.231727174 1.000000000
# plot the smallest correlations
cor_spearman_vec <- sort(abs(cor_spearman_mat), decreasing = T)
plot(cor_spearman_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_spearman_mat) == cor_spearman_vec[i], arr.ind = T)
 idx1 <- idx[1]; idx2 <- idx[2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_spearman_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_spearman_mat) == rev(cor_spearman_vec)[i], arr.ind = T)
 idx1 <- idx[1]; idx2 <- idx[2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_spearman_mat[idx1, idx2], 3)))
}

Heatmap

heatmap_list[[1]][[2]]

3. Kendall’s Tau

  • It is an alternative method to Spearman’s correlations, identifying monotonic relationships.
  • \(-1 \leq \rho_{Kendall}(X,Y) \leq 1\)
  • \(\rho_{Kendall}(X,Y) = \frac{\#\;concordant\;pairs - \#\;discordant \;pairs}{0.5n(n-1)}\)
cor_kendall_mat[1:5,1:5]
##            IGKC      HBA2       HBB      HBA1 IGHA1
## IGKC         NA        NA        NA        NA    NA
## HBA2  0.2660973        NA        NA        NA    NA
## HBB   0.2851550 0.7874437        NA        NA    NA
## HBA1  0.2778262 0.8024215 0.7666779        NA    NA
## IGHA1 0.3911323 0.1864774 0.2049764 0.1812993    NA
quantile(cor_kendall_mat, na.rm = T)
##           0%          25%          50%          75%         100% 
## -0.673362647 -0.001300303  0.019174624  0.202522688  1.000000000
quantile(abs(cor_kendall_mat), na.rm = T)
##          0%         25%         50%         75%        100% 
## 0.000000000 0.007219801 0.030312714 0.210479749 1.000000000
# plot the smallest correlations
cor_kendall_vec <- sort(abs(cor_kendall_mat), decreasing = T)
plot(cor_kendall_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_kendall_mat) == cor_kendall_vec[i], arr.ind = T)
 idx1 <- idx[1]; idx2 <- idx[2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_kendall_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_kendall_mat) == rev(cor_kendall_vec)[i], arr.ind = T)
 idx1 <- idx[1]; idx2 <- idx[2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_kendall_mat[idx1, idx2], 3)))
}

Heatmap

heatmap_list[[1]][[3]]

4. Hoeffding’s D

  • It tests the independence of data by calculating the distance between the product of the marginal distributions under the null hypothesis and the empirical bi-variate distribution.
  • \(-1 \leq D(X,Y) \leq 1\)
  • \(D(X,Y) = \frac{(n-2)(n-3)D_1+D_2-2(n-2)D_3}{n(n-1)(n-2)(n-3)(n-4)}\)
    • \(D_1 = \sum_{i=1}^{n} Q_i(Q_i-1)\)
    • \(D_2 = \sum_{i=1}^{n} (R_i-1)(R_i-2)(S_j-1)(S_j-2)\)
    • \(D_3 = \sum_{i=1}^{n} (R_i-2)(S_i-2)Q_i\)
cor_hoeffd_mat[1:5,1:5]
##             IGKC       HBA2        HBB       HBA1 IGHA1
## IGKC          NA         NA         NA         NA    NA
## HBA2  0.04547920         NA         NA         NA    NA
## HBB   0.05205833 0.52231121         NA         NA    NA
## HBA1  0.04956955 0.54588797 0.48581086         NA    NA
## IGHA1 0.10250330 0.02191421 0.02636821 0.02063613    NA
quantile(cor_hoeffd_mat, na.rm = T)
##            0%           25%           50%           75%          100% 
## -0.0004169447 -0.0003534625 -0.0002814756  0.0031924664  0.5588614687
# plot the smallest correlations
cor_hoeffd_vec <- sort(abs(cor_hoeffd_mat), decreasing = T)
plot(cor_hoeffd_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_hoeffd_mat) == cor_hoeffd_vec[i], arr.ind = T)
 idx1 <- idx[1]; idx2 <- idx[2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_hoeffd_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_hoeffd_mat) == rev(cor_hoeffd_vec)[i], arr.ind = T)
 idx1 <- idx[1]; idx2 <- idx[2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_hoeffd_mat[idx1, idx2], 3)))
}

Heatmap

heatmap_list[[1]][[4]]

5. Blomqvist’s Beta

  • It measures dependency between variables by constructing a two-way contingency table with the medians of each margin as cutting points.
  • \(0 \leq \beta \leq 1\)
  • \(\beta_n = \frac{n_1-n_2}{n_1+n_2} = \frac{2n_1}{n_1+n_2} - 1\)
  • \(\beta = P\{(X-\tilde{x})(Y-\tilde{y})>0\} - P\{(X-\tilde{x})(Y-\tilde{y}) < 0\}\)
cor_blomqvist_mat[1:5,1:5]
##            [,1]       [,2]       [,3]      [,4] [,5]
## [1,]         NA         NA         NA        NA   NA
## [2,] -0.5688889         NA         NA        NA   NA
## [3,]  0.8671111 -0.4600000         NA        NA   NA
## [4,] -0.9071111  0.6600000 -0.7982222        NA   NA
## [5,] -0.9182222  0.5857778 -0.8111111 0.8844444   NA
quantile(cor_blomqvist_mat, na.rm = T)
##   0%  25%  50%  75% 100% 
##   -1    1    1    1    1
quantile(abs(cor_blomqvist_mat), na.rm = T)
##          0%         25%         50%         75%        100% 
## 0.001333333 1.000000000 1.000000000 1.000000000 1.000000000
# plot the smallest correlations
cor_blomqvist_vec <- sort(abs(cor_blomqvist_mat), decreasing = T)
plot(cor_blomqvist_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_blomqvist_mat) == cor_blomqvist_vec[i], arr.ind = T)
 idx1 <- idx[i,1]; idx2 <- idx[i,2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_blomqvist_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_blomqvist_mat) == rev(cor_blomqvist_vec)[i], arr.ind = T)
 idx1 <- idx[1,1]; idx2 <- idx[1,2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_blomqvist_mat[idx1, idx2], 3)))
}

Heatmap

heatmap_list[[1]][[5]]
## NULL

6. Distance Correlation

  • it is measure to identify non-linear relationships between two random variables with energy distances.
  • distance correlation is calculated by dividing the distance covariance between X and Y by the product of their distance standard deviations.
  • \(0 \leq dCor \leq 1\)
  • \(dCor(X,Y) = \frac{dCov(Y,Y)}{\sqrt{dVar(X)dVar(Y)}}\)
    • \(dCov(X, Y) = \sqrt{\frac{1}{n^2} \sum_{k=1, l=1}^{n} A_{k,l}B_{k,l}}\)
    • \(dVar(X) = dCov(X,X) and dVar(Y) = dCov(Y, Y)\)
cor_dist_mat[1:5,1:5]
##           [,1]       [,2]       [,3]      [,4] [,5]
## [1,]        NA         NA         NA        NA   NA
## [2,] 0.1153987         NA         NA        NA   NA
## [3,] 0.1183309 0.84974651         NA        NA   NA
## [4,] 0.1342292 0.86772216 0.80209622        NA   NA
## [5,] 0.2526241 0.05499639 0.05971892 0.0548047   NA
quantile(cor_dist_mat, na.rm = T)
##           0%          25%          50%          75%         100% 
## 0.000000e+00 7.085565e-05 4.321211e-04 5.439166e-03 1.000000e+00
# plot the smallest correlations
cor_dist_vec <- sort(abs(cor_dist_mat), decreasing = T)
plot(cor_dist_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_dist_mat) == cor_dist_vec[i], arr.ind = T)
 idx1 <- idx[1]; idx2 <- idx[2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_dist_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_dist_mat) == rev(cor_dist_vec)[i], arr.ind = T)
 idx1 <- idx[1]; idx2 <- idx[2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_dist_mat[idx1, idx2], 3)))
}

Heatmap

heatmap_list[[1]][[6]]

7. Normalized Mutual Information

  • It measures how much one random variable gives information about the other. For example, High mutual information indicates a large reduction in uncertainty.
  • \(0 \leq MI(X,Y) \leq 1\), as it is normalized.
  • $MI(X,Y) = f_{X,Y} (x,y) log_2 ; dxdy $
  • \(MI(X,Y) = \sum \sum p_{X,Y} (x,y) log \frac{p_{X,Y} (x,y)}{P_X(x)P_Y(y)}\)
cor_MI_mat[1:5,1:5]
##           [,1]       [,2]       [,3]       [,4] [,5]
## [1,]        NA         NA         NA         NA   NA
## [2,] 0.0440143         NA         NA         NA   NA
## [3,] 0.0440143 1.00000000         NA         NA   NA
## [4,] 0.0440143 1.00000000 1.00000000         NA   NA
## [5,] 0.2136502 0.05290446 0.05290446 0.05290446   NA
quantile(cor_MI_mat, na.rm = T)
##            0%           25%           50%           75%          100% 
## -1.119380e-18  9.092741e-03  2.832558e-02  7.751096e-02  1.000000e+00
# plot the smallest correlations
cor_MI_vec <- sort(abs(cor_MI_mat), decreasing = T)
plot(cor_MI_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_MI_mat) == cor_MI_vec[i], arr.ind = T)
 idx1 <- idx[1]; idx2 <- idx[2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_MI_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_MI_mat) == rev(cor_MI_vec)[i], arr.ind = T)
 idx1 <- idx[1]; idx2 <- idx[2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_MI_mat[idx1, idx2], 3)))
}

Heatmap

heatmap_list[[1]][[7]]

8. Chatterjee’s XI Correlation

  • It measures the degree of dependence between the variables with concept of rank.
  • \(0 \leq \xi_n \leq 1\)
  • \(\xi_n(X,Y) = 1 - \frac{3\sum_{i=1}^{n-1} |r_{i+1} -r_i|}{n^2-1}\)
  • \(\xi_n(X,Y) = 1 - \frac{n\sum_{i=1}^{n-1}|r_{i+1}-r_i}{2\sum_{i=1}^{n}l_i(n-l_i)}\)
cor_XI_mat[1:5,1:5]
##           [,1]       [,2]       [,3]       [,4] [,5]
## [1,]        NA         NA         NA         NA   NA
## [2,] 0.1009619         NA         NA         NA   NA
## [3,] 0.1125111 0.68139610         NA         NA   NA
## [4,] 0.1014173 0.70251135 0.65458743         NA   NA
## [5,] 0.2087703 0.03512883 0.05167311 0.03598037   NA
quantile(cor_XI_mat, na.rm = T)
##            0%           25%           50%           75%          100% 
## -0.0415879994 -0.0003705076  0.0218764219  0.2681462148  0.9966268892
# plot the smallest correlations
cor_XI_vec <- sort(abs(cor_XI_mat), decreasing = T)
plot(cor_XI_vec)

#plot the high correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_XI_mat) == cor_XI_vec[i], arr.ind = T)
 idx1 <- idx[1]; idx2 <- idx[2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_XI_mat[idx1, idx2], 3)))
}

#plot the lowest correlations
par(mfrow = c(2,2))
for(i in 1:4){
 idx <- which(abs(cor_XI_mat) == rev(cor_XI_vec)[i], arr.ind = T)
 idx1 <- idx[1]; idx2 <- idx[2]
 
 plot(saver_mat[,idx1], saver_mat[,idx2],   asp = T,
      pch = 16, xlab = paste0(colnames(saver_mat)[idx1], ", (", idx1, ")"),
      ylab = paste0(colnames(saver_mat)[idx2], ", (", idx2, ")"), 
      main = paste0("Correlation of ", round(cor_XI_mat[idx1, idx2], 3)))
}

Heatmap

heatmap_list[[1]][[8]]

quantile_mat <- c()
for (i in 1:length(cormat_list)){
  quantile_mat <- rbind(quantile_mat,
                        quantile(abs(cormat_list[[i]]), probs = c(0.10, 0.95), na.rm = T))
}
rownames(quantile_mat) <- c("Pearson", "Spearman", "Kendall", "Hoeffding's D",
                            "Blomqvist's Beta","Dist. Corr", "NMI", "XI Corr")

quantile_mat
##                           10%        95%
## Pearson          8.844389e-04 0.03502224
## Spearman         1.144231e-03 0.57049810
## Kendall          1.114172e-03 0.52502885
## Hoeffding's D    2.733574e-04 0.05633377
## Blomqvist's Beta 9.995556e-01 1.00000000
## Dist. Corr       2.513364e-03 0.29575625
## NMI              4.029217e-06 0.04152404
## XI Corr          3.705076e-04 0.73673423
# save(quantile_mat, store_date, file = "CD4_seurat_quantile2.RData")